{
 "cells": [
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Model Upload"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "\u001B[1m[\u001B[0m\u001B[34;49mnotice\u001B[0m\u001B[1;39;49m]\u001B[0m\u001B[39;49m A new release of pip is available: \u001B[0m\u001B[31;49m23.0.1\u001B[0m\u001B[39;49m -> \u001B[0m\u001B[32;49m23.2.1\u001B[0m\n",
      "\u001B[1m[\u001B[0m\u001B[34;49mnotice\u001B[0m\u001B[1;39;49m]\u001B[0m\u001B[39;49m To update, run: \u001B[0m\u001B[32;49mpip install --upgrade pip\u001B[0m\n"
     ]
    }
   ],
   "source": [
    "# Install dependencies\n",
    "!pip install -q numerapi pandas lightgbm cloudpickle pyarrow scikit-learn scipy==1.10.1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2023-08-30 17:40:06,596 INFO numerapi.utils: target file already exists\n",
      "2023-08-30 17:40:06,597 INFO numerapi.utils: download complete\n",
      "2023-08-30 17:40:07,114 INFO numerapi.utils: target file already exists\n",
      "2023-08-30 17:40:07,115 INFO numerapi.utils: download complete\n"
     ]
    }
   ],
   "source": [
    "from numerapi import NumerAPI\n",
    "import pandas as pd\n",
    "import json\n",
    "napi = NumerAPI()\n",
    "\n",
    "# Download data\n",
    "napi.download_dataset(\"v4.2/train_int8.parquet\");\n",
    "napi.download_dataset(\"v4.2/features.json\");\n",
    "\n",
    "# Load data\n",
    "feature_metadata = json.load(open(\"v4.2/features.json\")) \n",
    "features = feature_metadata[\"feature_sets\"][\"medium\"] # use \"all\" for better performance. Requires more RAM.\n",
    "train = pd.read_parquet(\"v4.2/train_int8.parquet\", columns=[\"era\"]+features+[\"target\"])\n",
    "\n",
    "# For better models, join train and validation data and train on all of it\n",
    "# napi.download_dataset(\"v4.2/validation_int8.parquet\");\n",
    "# validation = pd.read_parquet(\"v4.2/validation_int8.parquet\", columns=[\"era\"]+features+[\"target\"])\n",
    "# validation = validation[validation[\"data_type\"] == \"validation\"] # drop rows which don't have targets yet\n",
    "# train = pd.concat([train, validation])\n",
    "\n",
    "# Downsample for speed\n",
    "train = train[train[\"era\"].isin(train[\"era\"].unique()[::4])]  # skip this step for better performance"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[LightGBM] [Warning] Accuracy may be bad since you didn't explicitly set num_leaves OR 2^max_depth > num_leaves. (num_leaves=31).\n",
      "[LightGBM] [Warning] Accuracy may be bad since you didn't explicitly set num_leaves OR 2^max_depth > num_leaves. (num_leaves=31).\n",
      "[LightGBM] [Warning] Auto-choosing row-wise multi-threading, the overhead of testing was 0.014111 seconds.\n",
      "You can set `force_row_wise=true` to remove the overhead.\n",
      "And if memory is not enough, you can set `force_col_wise=true`.\n",
      "[LightGBM] [Info] Total Bins 2915\n",
      "[LightGBM] [Info] Number of data points in the train set: 606176, number of used features: 583\n",
      "[LightGBM] [Info] Start training from score 0.499979\n",
      "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n"
     ]
    }
   ],
   "source": [
    "# Train model\n",
    "import lightgbm as lgb\n",
    "model = lgb.LGBMRegressor(\n",
    "    n_estimators=2000,  # If you want to use a larger model we've found 20_000 trees to be better\n",
    "    learning_rate=0.01, # and a learning rate of 0.001\n",
    "    max_depth=5, # and max_depth=6\n",
    "    num_leaves=2**5-1, # and num_leaves of 2**6-1\n",
    "    colsample_bytree=0.1\n",
    ")\n",
    "model.fit(\n",
    "    train[features],\n",
    "    train[\"target\"]\n",
    ");"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Define predict function\n",
    "def predict(live_features: pd.DataFrame) -> pd.DataFrame:\n",
    "    live_predictions = model.predict(live_features[features])\n",
    "    submission = pd.Series(live_predictions, index=live_features.index)\n",
    "    return submission.to_frame(\"prediction\")\n",
    "\n",
    "# Pickle predict function\n",
    "import cloudpickle\n",
    "p = cloudpickle.dumps(predict)\n",
    "with open(\"predict_barebones.pkl\", \"wb\") as f:\n",
    "    f.write(p)\n",
    "\n",
    "# Download file if running in Google Colab\n",
    "try:\n",
    "    from google.colab import files\n",
    "    files.download('predict_barebones.pkl')\n",
    "except:\n",
    "    pass"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.12"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}